# ===----------------------------------------------------------------------=== #
# Copyright (c) 2025, Modular Inc. All rights reserved.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions:
# https://llvm.org/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===----------------------------------------------------------------------=== #

from internal_utils import Table, TuningConfig


@fieldwise_init
@register_passable("trivial")
struct TuningConfigNvidia(TuningConfig):
    """A single tuning-table entry: problem fields plus `TUNE_*` settings.

    All fields are plain `Int`s and the initializer is generated field-wise,
    so entries can be built with keyword arguments in declaration order.

    NOTE(review): the per-field notes below are inferred from the field names
    only; confirm them against the kernel that consumes this config.
    """

    # Problem description (presumably the GEMM M/N/K dimensions -- confirm).
    var M: Int
    var N: Int
    var K: Int

    # Tuned parameters (presumably block tile BM/BN/BK and warp tile WM/WN,
    # pipeline stage count, and split-K partition counts -- confirm).
    var TUNE_BM: Int
    var TUNE_BN: Int
    var TUNE_BK: Int
    var TUNE_WM: Int
    var TUNE_WN: Int
    var TUNE_NUM_STAGES: Int
    var TUNE_NUM_K_PARTITIONS: Int
    var TUNE_NUM_WARP_K_PARTITIONS: Int
    # Presumably the number of participating ranks/devices -- confirm.
    var nranks: Int

    fn __str__(self) -> String:
        """Formats a short label for this entry.

        Only M, N, K, TUNE_BM and TUNE_BN appear in the label; the remaining
        fields are not included.

        Returns:
            A string of the form `m:<M>/n:<N>/k:<K>/bm:<TUNE_BM>/bn:<TUNE_BN>`.
        """
        return (
            "m:"
            + String(self.M)
            + "/n:"
            + String(self.N)
            + "/k:"
            + String(self.K)
            + "/bm:"
            + String(self.TUNE_BM)
            + "/bn:"
            + String(self.TUNE_BN)
        )


# Compile-time list of `TuningConfigNvidia` entries; it is the data passed to
# `Table(...)` to build `TuningTableNvidia` at the bottom of this file.
#
# Layout of the data as written here:
#   * Every entry sets all twelve fields by keyword.
#   * Entries are grouped by M, in the order 128000, 128, 16, 256, 32, 64,
#     65536, 1, 8.
#   * Within each M group the same six (N, K, nranks) combinations appear in
#     the same order: (8192, 14336, 2), (8192, 3584, 8), (8192, 7168, 4),
#     (14336, 8192, 4), (28672, 8192, 2), (7168, 8192, 8).
#
# NOTE(review): the per-entry comments state the values were generated from
# `output.yaml`; presumably updates should be made by regenerating rather than
# by hand-editing -- confirm with the tuning workflow.
# NOTE(review): whether entry order affects `Table` lookups cannot be
# determined from this file; keep the order unchanged unless verified.
# NOTE(review): some M=1 entries use TUNE_NUM_K_PARTITIONS values (3, 6) that
# do not evenly divide K -- presumably the kernel handles the remainder;
# confirm before reusing these values elsewhere.
comptime configs: List[TuningConfigNvidia] = [
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=128000,
        N=8192,
        K=14336,
        TUNE_BM=256,
        TUNE_BN=128,
        TUNE_BK=64,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=4,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=2,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=128000,
        N=8192,
        K=3584,
        TUNE_BM=128,
        TUNE_BN=256,
        TUNE_BK=32,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=8,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=128000,
        N=8192,
        K=7168,
        TUNE_BM=128,
        TUNE_BN=256,
        TUNE_BK=32,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=4,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=128000,
        N=14336,
        K=8192,
        TUNE_BM=128,
        TUNE_BN=256,
        TUNE_BK=32,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=4,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=128000,
        N=28672,
        K=8192,
        TUNE_BM=128,
        TUNE_BN=256,
        TUNE_BK=32,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=2,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=128000,
        N=7168,
        K=8192,
        TUNE_BM=128,
        TUNE_BN=256,
        TUNE_BK=32,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=8,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=128,
        N=8192,
        K=14336,
        TUNE_BM=128,
        TUNE_BN=64,
        TUNE_BK=64,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=4,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=2,
        nranks=2,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=128,
        N=8192,
        K=3584,
        TUNE_BM=64,
        TUNE_BN=128,
        TUNE_BK=64,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=4,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=2,
        nranks=8,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=128,
        N=8192,
        K=7168,
        TUNE_BM=64,
        TUNE_BN=128,
        TUNE_BK=64,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=4,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=2,
        nranks=4,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=128,
        N=14336,
        K=8192,
        TUNE_BM=64,
        TUNE_BN=256,
        TUNE_BK=64,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=4,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=128,
        N=28672,
        K=8192,
        TUNE_BM=128,
        TUNE_BN=128,
        TUNE_BK=64,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=2,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=128,
        N=7168,
        K=8192,
        TUNE_BM=64,
        TUNE_BN=128,
        TUNE_BK=32,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=4,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=4,
        nranks=8,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=16,
        N=8192,
        K=14336,
        TUNE_BM=64,
        TUNE_BN=64,
        TUNE_BK=64,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=2,
        nranks=2,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=16,
        N=8192,
        K=3584,
        TUNE_BM=64,
        TUNE_BN=64,
        TUNE_BK=64,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=3,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=4,
        nranks=8,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=16,
        N=8192,
        K=7168,
        TUNE_BM=64,
        TUNE_BN=64,
        TUNE_BK=32,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=4,
        nranks=4,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=16,
        N=14336,
        K=8192,
        TUNE_BM=64,
        TUNE_BN=128,
        TUNE_BK=64,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=4,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=2,
        nranks=4,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=16,
        N=28672,
        K=8192,
        TUNE_BM=64,
        TUNE_BN=256,
        TUNE_BK=64,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=4,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=2,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=16,
        N=7168,
        K=8192,
        TUNE_BM=64,
        TUNE_BN=64,
        TUNE_BK=32,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=4,
        nranks=8,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=256,
        N=8192,
        K=14336,
        TUNE_BM=128,
        TUNE_BN=128,
        TUNE_BK=32,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=2,
        nranks=2,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=256,
        N=8192,
        K=3584,
        TUNE_BM=128,
        TUNE_BN=128,
        TUNE_BK=64,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=8,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=256,
        N=8192,
        K=7168,
        TUNE_BM=128,
        TUNE_BN=128,
        TUNE_BK=32,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=2,
        nranks=4,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=256,
        N=14336,
        K=8192,
        TUNE_BM=256,
        TUNE_BN=128,
        TUNE_BK=32,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=4,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=256,
        N=28672,
        K=8192,
        TUNE_BM=128,
        TUNE_BN=256,
        TUNE_BK=32,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=2,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=256,
        N=7168,
        K=8192,
        TUNE_BM=128,
        TUNE_BN=128,
        TUNE_BK=32,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=2,
        nranks=8,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=32,
        N=8192,
        K=14336,
        TUNE_BM=64,
        TUNE_BN=64,
        TUNE_BK=64,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=2,
        nranks=2,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=32,
        N=8192,
        K=3584,
        TUNE_BM=64,
        TUNE_BN=64,
        TUNE_BK=64,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=3,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=4,
        nranks=8,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=32,
        N=8192,
        K=7168,
        TUNE_BM=64,
        TUNE_BN=64,
        TUNE_BK=64,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=2,
        nranks=4,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=32,
        N=14336,
        K=8192,
        TUNE_BM=128,
        TUNE_BN=128,
        TUNE_BK=64,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=4,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=32,
        N=28672,
        K=8192,
        TUNE_BM=64,
        TUNE_BN=256,
        TUNE_BK=64,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=4,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=2,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=32,
        N=7168,
        K=8192,
        TUNE_BM=64,
        TUNE_BN=64,
        TUNE_BK=64,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=3,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=4,
        nranks=8,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=64,
        N=8192,
        K=14336,
        TUNE_BM=64,
        TUNE_BN=64,
        TUNE_BK=64,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=2,
        nranks=2,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=64,
        N=8192,
        K=3584,
        TUNE_BM=64,
        TUNE_BN=64,
        TUNE_BK=64,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=3,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=4,
        nranks=8,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=64,
        N=8192,
        K=7168,
        TUNE_BM=64,
        TUNE_BN=64,
        TUNE_BK=64,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=2,
        nranks=4,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=64,
        N=14336,
        K=8192,
        TUNE_BM=128,
        TUNE_BN=128,
        TUNE_BK=64,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=4,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=64,
        N=28672,
        K=8192,
        TUNE_BM=64,
        TUNE_BN=256,
        TUNE_BK=64,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=4,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=2,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=64,
        N=7168,
        K=8192,
        TUNE_BM=64,
        TUNE_BN=64,
        TUNE_BK=64,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=3,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=4,
        nranks=8,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=65536,
        N=8192,
        K=14336,
        TUNE_BM=128,
        TUNE_BN=256,
        TUNE_BK=32,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=4,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=2,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=65536,
        N=8192,
        K=3584,
        TUNE_BM=128,
        TUNE_BN=256,
        TUNE_BK=32,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=8,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=65536,
        N=8192,
        K=7168,
        TUNE_BM=128,
        TUNE_BN=256,
        TUNE_BK=32,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=4,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=65536,
        N=14336,
        K=8192,
        TUNE_BM=128,
        TUNE_BN=256,
        TUNE_BK=32,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=4,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=65536,
        N=28672,
        K=8192,
        TUNE_BM=128,
        TUNE_BN=256,
        TUNE_BK=32,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=2,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=65536,
        N=7168,
        K=8192,
        TUNE_BM=128,
        TUNE_BN=256,
        TUNE_BK=32,
        TUNE_WM=64,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=8,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=1,
        N=8192,
        K=14336,
        TUNE_BM=16,
        TUNE_BN=64,
        TUNE_BK=64,
        TUNE_WM=16,
        TUNE_WN=64,
        TUNE_NUM_STAGES=4,
        TUNE_NUM_K_PARTITIONS=2,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=2,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=1,
        N=8192,
        K=3584,
        TUNE_BM=16,
        TUNE_BN=128,
        TUNE_BK=64,
        TUNE_WM=16,
        TUNE_WN=64,
        TUNE_NUM_STAGES=3,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=2,
        nranks=8,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=1,
        N=8192,
        K=7168,
        TUNE_BM=16,
        TUNE_BN=256,
        TUNE_BK=64,
        TUNE_WM=16,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=3,
        TUNE_NUM_WARP_K_PARTITIONS=2,
        nranks=4,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=1,
        N=14336,
        K=8192,
        TUNE_BM=16,
        TUNE_BN=64,
        TUNE_BK=32,
        TUNE_WM=16,
        TUNE_WN=64,
        TUNE_NUM_STAGES=4,
        TUNE_NUM_K_PARTITIONS=3,
        TUNE_NUM_WARP_K_PARTITIONS=4,
        nranks=4,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=1,
        N=28672,
        K=8192,
        TUNE_BM=16,
        TUNE_BN=64,
        TUNE_BK=64,
        TUNE_WM=16,
        TUNE_WN=64,
        TUNE_NUM_STAGES=3,
        TUNE_NUM_K_PARTITIONS=6,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=2,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=1,
        N=7168,
        K=8192,
        TUNE_BM=16,
        TUNE_BN=64,
        TUNE_BK=32,
        TUNE_WM=16,
        TUNE_WN=64,
        TUNE_NUM_STAGES=6,
        TUNE_NUM_K_PARTITIONS=3,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=8,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=8,
        N=8192,
        K=14336,
        TUNE_BM=16,
        TUNE_BN=64,
        TUNE_BK=64,
        TUNE_WM=16,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=2,
        nranks=2,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=8,
        N=8192,
        K=3584,
        TUNE_BM=16,
        TUNE_BN=64,
        TUNE_BK=64,
        TUNE_WM=16,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=2,
        nranks=8,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=8,
        N=8192,
        K=7168,
        TUNE_BM=16,
        TUNE_BN=64,
        TUNE_BK=64,
        TUNE_WM=16,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=2,
        nranks=4,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=8,
        N=14336,
        K=8192,
        TUNE_BM=16,
        TUNE_BN=128,
        TUNE_BK=64,
        TUNE_WM=16,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=4,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=8,
        N=28672,
        K=8192,
        TUNE_BM=16,
        TUNE_BN=128,
        TUNE_BK=64,
        TUNE_WM=16,
        TUNE_WN=64,
        TUNE_NUM_STAGES=3,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=1,
        nranks=2,
    ),
    # Automatically generated from [output.yaml]
    TuningConfigNvidia(
        M=8,
        N=7168,
        K=8192,
        TUNE_BM=16,
        TUNE_BN=64,
        TUNE_BK=64,
        TUNE_WM=16,
        TUNE_WN=64,
        TUNE_NUM_STAGES=5,
        TUNE_NUM_K_PARTITIONS=1,
        TUNE_NUM_WARP_K_PARTITIONS=2,
        nranks=8,
    ),
]


# Compile-time `Table` (imported from `internal_utils`) built from the
# `configs` list defined in this file.
# NOTE(review): the string argument is presumably the table's display/debug
# name -- confirm against the `Table` constructor in `internal_utils`.
comptime TuningTableNvidia = Table(configs, "TuningTableNvidia")
